This is the fourth installment of Applying Machine Learning to Kaggle Datasets, a series of IPython notebooks demonstrating the methods described in the Stanford Machine Learning Course. In each notebook, I apply one method taught in the course to an open Kaggle competition.
In this notebook, I demonstrate support vector machines using the Titanic competition.
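Before diving in, a quick note on the two hyperparameters tuned below. An SVM with a radial basis function (RBF) kernel, $K(x, x') = \exp(-\gamma \|x - x'\|^2)$, is controlled by the penalty $C$, which trades margin width against misclassified training points, and the kernel width $\gamma$: small $\gamma$ yields a smooth, nearly linear decision boundary, while large $\gamma$ lets the boundary wrap tightly around individual training points. Both are searched on a $\log_2$ grid later in the notebook.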
In [270]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
#import code.SVM_Funcs as svm
import sklearn.svm as skm
In [271]:
data = pd.read_csv("./data/titanic/train.csv", index_col="PassengerId")
data.head()
Out[271]:
In [272]:
def prepdata(data):
    """
    Prepare data for use in scikit-learn SVM routines.
    # sklearn.svm.SVC is the class for support vector machine classification:
    # results = skm.SVC(...).fit(datain, target)
    # datain: ndarray: nobs x nfeatures
    # target: ndarray: nobs,
    """
    # Input data.
    # a. Gather numerical inputs, fill NaN values with the mean of that column
    numcols = ['Age','Fare','Parch','SibSp']
    datain = data[numcols].copy()
    for col in datain.columns:
        datain[col].fillna(value=datain[col].mean(),inplace=True)
    # b. Trim outliers from numerical inputs
    #datain['Age'][datain['Age']>70] = 70 # Cap age at 70
    #datain['Fare'][datain['Fare']>100] = 100
    #datain['Parch'][datain['Parch']>2] = 2
    #datain['SibSp'][datain['SibSp']>4] = 4
    # c. Gather categorical variables as dummy variables of len(# of categories)
    catcols = ['Pclass','Sex','Cabin','Embarked','Name']
    for col in catcols:
        # Make a copy of the categorical column to process
        column = data[col].copy()
        # Extract cabin level from Cabin number
        if col=='Cabin':
            column = pd.Series(['NaN' if val is np.nan else val[0] for val in column],index=data.index)
            # Cabins T and G have very few occupants (1 and 4, respectively)
            column[column.isin(['T','G'])] = 'NaN'
        # Extract title from Name
        if col=='Name':
            column = pd.Series([name.split('.')[0].split(' ')[-1] for name in column],index=data.index)
            # Only include titles with more than 2% occurrence in the training data
            #titles = column.value_counts().keys()[column.value_counts().values>0.02*len(data)]
            titles = ['Master','Mr','Mrs','Miss'] # entered manually so it works with the test data
            column[column.isin(titles)==False] = 'Other'
        dummies = pd.get_dummies(column,prefix=col)
        datain = datain.join(dummies)
    # Return datain to the calling program
    return datain
datain = prepdata(data)
target = data['Survived']
# Linearly scale all non-categorical columns to range [-1,1]
numcols = ['Age','Fare','Parch','SibSp']
for col in numcols:
    # Zero mean, unit variance
    #datain[col] = (datain[col] - datain[col].mean()) / datain[col].std()
    # Range [-1,1]
    datain[col] = (datain[col]-datain[col].min())*2./(datain[col].max()-datain[col].min())-1.
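As a quick illustration of the title extraction in prepdata, using the first passenger in train.csv:

name = "Braund, Mr. Owen Harris"
title = name.split('.')[0].split(' ')[-1]
print title   # -> Mr

pd.get_dummies then turns each surviving title ('Master', 'Mr', 'Mrs', 'Miss', 'Other') into its own 0/1 indicator column.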
In [273]:
def crossvalidate(k,model,datain,target):
    """
    Perform k-fold cross validation on the model trained
    with subsets of the input data (datain) and the
    output labels (target).
    Return the fraction of the test set correctly
    predicted during the cross validation.
    """
    # Dictionary to hold cross-validation results
    validation = {'prediction':np.array([]),
                  'target':np.array([])}
    # Loop over data set k times
    if k>1:
        for ii in range(k):
            # Split data into training and test data
            indices = datain.index[ii*len(datain)/k:(ii+1)*len(datain)/k]
            testin = datain[datain.index.isin(indices)]
            trainin = datain[~datain.index.isin(indices)]
            testout = target[datain.index.isin(indices)]
            trainout = target[~datain.index.isin(indices)]
            # Train model on the training data
            results = model.fit(trainin,trainout)
            validation['prediction'] = np.append(validation['prediction'],results.predict(testin),0)
            validation['target'] = np.append(validation['target'],testout,0)
    else:
        results = model.fit(datain,target)
        validation['prediction'] = results.predict(datain)
        validation['target'] = target
    return 1-sum((validation['prediction']-validation['target'])!=0)/float(len(datain))
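For reference, scikit-learn ships a built-in helper that does roughly the same job. A minimal sketch follows; note the import lives in sklearn.cross_validation in the releases this notebook targets and in sklearn.model_selection in later ones, and that it stratifies folds and averages per-fold accuracies rather than pooling predictions, so its numbers can differ slightly from crossvalidate:

from sklearn.cross_validation import cross_val_score  # sklearn.model_selection in newer releases
scores = cross_val_score(skm.SVC(kernel='rbf'), datain, target, cv=3)
print scores.mean()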
In [281]:
# Perform k-fold cross validation
k = 3
# Define powers of C and gamma to test
Cpow = np.linspace(-5,12,8)
Gpow = np.linspace(-15,3,8)
# Loop over each combination of C and gamma
score = pd.DataFrame(index=Cpow,columns=[gp for gp in Gpow])
score.index.name = 'Cvalues'
for cp in Cpow:
    #print "C = {}".format(cp)
    for gp in Gpow:
        #print r" $\gamma = ${}".format(gp)
        model = skm.SVC(kernel='rbf',C=2**cp,gamma=2**gp)#,class_weight='auto')
        score.loc[cp,gp] = crossvalidate(k,model,datain,target)
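The nested loop above is a hand-rolled version of what scikit-learn's GridSearchCV automates. A rough sketch of the equivalent call (the import path again depends on the scikit-learn version, and fold construction differs slightly, so scores may not match exactly):

from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer releases
param_grid = {'C': 2.0**Cpow, 'gamma': 2.0**Gpow}
search = GridSearchCV(skm.SVC(kernel='rbf'), param_grid, cv=k)
search.fit(datain, target)
print search.best_params_, search.best_score_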
In [282]:
# Visualize results of coarse grid search to refine grid
C,G = np.meshgrid(Cpow,Gpow)
CF = plt.contourf(C,G,score.T,
                  levels=np.linspace(0.8,0.84,21),
                  cmap=plt.cm.jet,vmin=0.78,vmax=0.84,extend='min')
plt.colorbar(CF)
plt.title('Results of Coarse Grid Search',fontsize=24)
plt.ylabel(r'$log_2 \gamma$',fontsize=18)
plt.xlabel(r'$log_2 C$',fontsize=18)
Out[282]:
In [287]:
# Set new values of C and gamma based on figure above
Cpow = np.linspace(0,6,11)
Gpow = np.linspace(-6,0,11)
# Loop over each combination of C and gamma
score = pd.DataFrame(index=Cpow,columns=[gp for gp in Gpow])
score.index.name = 'Cvalues'
for cp in Cpow:
    #print "C = {}".format(cp)
    for gp in Gpow:
        #print r" $\gamma = ${}".format(gp)
        model = skm.SVC(kernel='rbf',C=2**cp,gamma=2**gp)#,class_weight='auto')
        score.loc[cp,gp] = crossvalidate(k,model,datain,target)
In [288]:
# Visualize results of refined grid search to refine the grid further
C,G = np.meshgrid(Cpow,Gpow)
CF = plt.contourf(C,G,score.T,
                  levels=np.linspace(0.8,0.84,21),
                  cmap=plt.cm.jet,vmin=0.78,vmax=0.84,extend='min')
plt.colorbar(CF)
plt.title('Results of Refined Grid Search',fontsize=24)
plt.ylabel(r'$log_2 \gamma$',fontsize=18)
plt.xlabel(r'$log_2 C$',fontsize=18)
Out[288]:
In [289]:
# Set new values of C and gamma based on figure above
Cpow = np.linspace(3,6,11)
Gpow = np.linspace(-4,-1,11)
# Loop over each combination of C and gamma
score = pd.DataFrame(index=Cpow,columns=[gp for gp in Gpow])
score.index.name = 'Cvalues'
for cp in Cpow:
    #print "C = {}".format(cp)
    for gp in Gpow:
        #print r" $\gamma = ${}".format(gp)
        model = skm.SVC(kernel='rbf',C=2**cp,gamma=2**gp)#,class_weight='auto')
        score.loc[cp,gp] = crossvalidate(k,model,datain,target)
In [290]:
# Visualize results of ultra-refined grid search
C,G = np.meshgrid(Cpow,Gpow)
CF = plt.contourf(C,G,score.T,
                  levels=np.linspace(0.8,0.84,21),
                  cmap=plt.cm.jet,vmin=0.78,vmax=0.84,extend='min')
plt.colorbar(CF)
plt.title('Results of Ultra-Refined Grid Search',fontsize=24)
plt.ylabel(r'$log_2 \gamma$',fontsize=18)
plt.xlabel(r'$log_2 C$',fontsize=18)
Out[290]:
In [291]:
# Select final C and gamma values
C = 2**3.8 # with class_weight = 'auto' C = 2**3.1
g = 2**-1.5 # with class_weight = 'auto' g = 2**-2.0
# Train final model with all the training data
model = skm.SVC(kernel='rbf',C=C,gamma=g)#,class_weight='auto')
results = model.fit(datain,target)
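As a quick sanity check on the chosen C and gamma, it can help to look at how many training points end up as support vectors; if nearly the whole training set is retained, the model is probably overfitting. One way to check on the fitted model:

print "Support vectors: {} of {} training points".format(len(results.support_), len(datain))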
In [292]:
# Read test data from file
test = pd.read_csv("./data/titanic/test.csv",index_col="PassengerId")
# Prepare the test data with the same feature pipeline used on the training data
testin = prepdata(test)
# Linearly scale all non-categorical columns to range [-1,1],
# using scaling from the training data.
tempdata = prepdata(data)
numcols = ['Age','Fare','Parch','SibSp']
#tempdata = data[numcols].copy()
for col in numcols:
    tempdata[col].fillna(value=tempdata[col].mean(),inplace=True)
    # Zero mean, unit variance
    #testin[col] = (testin[col] - tempdata[col].mean()) / tempdata[col].std()
    # Range [-1,1]
    testin[col] = (testin[col]-tempdata[col].min())*2./(tempdata[col].max()-tempdata[col].min())-1.
# Make predictions using the test data from Kaggle
predictions = pd.DataFrame(results.predict(testin),index=testin.index,columns=['Survived'])
predictions = predictions.astype(int)
predictions.to_csv('./predictions/SVM_Prediction.csv',sep=',')
print "Score on Training Data = {}".format(1-sum((results.predict(datain)-target)!=0)/float(len(datain)))
This results in a final score of 0.77990 on the public leaderboard.